# Fetch every utterance in the "Eng-NA" collection.
utterances <- get_utterances(collection = "Eng-NA")

# Keep only child / mother / father speech, lower-case the gloss text, and
# order rows by transcript and then by position within the transcript.
cleanutts <- utterances %>%
  filter(speaker_role %in% c("Target_Child", "Mother", "Father")) %>%
  mutate(gloss = str_to_lower(gloss)) %>%
  arrange(transcript_id, utterance_order)
#' Extract the utterances of one or more transcripts
#'
#' Reads from the global `cleanutts` table.
#'
#' @param these_transcript_ids Vector of transcript ids to keep.
#' @return The rows of `cleanutts` whose `transcript_id` is in
#'   `these_transcript_ids` and whose `gloss` is not the empty string,
#'   restricted to the columns gloss, transcript_id, utterance_order,
#'   speaker_code, target_child_age and target_child_id.
get_convo <- function(these_transcript_ids) {
  cleanutts %>%
    filter(
      transcript_id %in% these_transcript_ids,
      gloss != ""
    ) %>%
    select(
      gloss,
      transcript_id,
      utterance_order,
      speaker_code,
      target_child_age,
      target_child_id
    )
}
# Utterances of transcript 2798; the Python code below reads this table as
# `r.convos`.
convos <- get_convo(2798)
import io
import os
import gensim
from gensim import utils
import gensim.models
import gensim.models.word2vec
from gensim.test.utils import datapath
import numpy as np
import sklearn
import matplotlib
from sklearn.decomposition import IncrementalPCA
from sklearn.manifold import TSNE
# Load the saved Word2Vec model from a path relative to the working directory.
# get_vectors() looks words up in this module-level `model`.
# NOTE(review): file name suggests it was trained on adult CHILDES speech -- confirm.
model = gensim.models.Word2Vec.load("models/childes_adult_word2vec.model")
def get_vectors(convo, w2v=None):
    """Return one averaged word vector per utterance.

    Each utterance is split on whitespace; the vectors of the words known to
    the Word2Vec model are summed and divided by the number of known words.
    Utterances with no known word (including empty ones) get an all-zero row.

    Args:
        convo: Sized iterable of utterance strings (e.g. a list or a pandas
            Series of glosses).
        w2v: Optional model exposing a ``wv`` keyed-vectors attribute.
            Defaults to the module-level ``model``.

    Returns:
        numpy array of shape ``(len(convo), vector_size)``.
    """
    if w2v is None:
        w2v = model
    keyed_vectors = w2v.wv
    # vector_size avoids probing a specific word ('and'), which raises
    # KeyError when that word is missing from the vocabulary.
    vec_length = keyed_vectors.vector_size
    discourse_vectors = np.zeros((len(convo), vec_length))
    for index, utt in enumerate(convo):
        sum_vector = np.zeros(vec_length)
        known_words = 0
        for word in utt.split():
            # Membership on the KeyedVectors works in gensim 3.x and 4.x;
            # the `.vocab` attribute was removed in gensim 4.
            if word in keyed_vectors:
                sum_vector += keyed_vectors[word]
                known_words += 1
        if known_words > 0:
            discourse_vectors[index] = sum_vector / known_words
        # Otherwise the row keeps the zeros it was initialised with.
    return discourse_vectors
def reduce_dimensions(all_convos):
    """Project utterance vectors to 2-D with t-SNE and attach row keys.

    Args:
        all_convos: Utterance strings, passed straight to get_vectors().

    Returns:
        numpy array with four columns: t-SNE x, t-SNE y, and the
        ``transcript_id`` and ``utterance_order`` columns of ``r.convos``.
        The key columns are read from the global ``r.convos``, not from
        ``all_convos``, so the two must have the same number of rows.
    """
    utterance_vectors = get_vectors(all_convos)
    # random_state=0 fixes the seed passed to TSNE.
    tsne = TSNE(n_components=2, random_state=0)
    embedded = tsne.fit_transform(utterance_vectors)
    return np.column_stack((
        embedded[:, 0],
        embedded[:, 1],
        r.convos["transcript_id"],
        r.convos["utterance_order"],
    ))
# Embed every gloss of the R-side `convos` table; the R code below reads the
# result as `py$convo_vecs`.
convo_vecs = reduce_dimensions(r.convos["gloss"])
# Attach the 2-D embedding computed in Python to each utterance.
# `py$convo_vecs` has four unnamed columns (x, y, transcript_id,
# utterance_order). Name them explicitly instead of relying on tibble's
# deprecated automatic V1..Vn naming for unnamed matrices, which warns.
convo_vec_names <- function(nms) c("V1", "V2", "V3", "V4")

vizconvos <- convos %>%
  left_join(
    as_tibble(py$convo_vecs, .name_repair = convo_vec_names),
    by = c("transcript_id" = "V3", "utterance_order" = "V4")
  ) %>%
  # Bucket utterances by utterance_order in steps of four; used as the
  # animation frame in the plotly chart.
  mutate(exchange = floor(utterance_order / 4))
# Embedding scatter, one point per utterance, colored by speaker.
vizconvos %>%
  ggplot(aes(x = V1, y = V2, color = as_factor(speaker_code))) +
  geom_point()

# Same embedding colored by position in the transcript, one panel per
# target-child age.
vizconvos %>%
  ggplot(aes(x = V1, y = V2, color = utterance_order)) +
  geom_point() +
  facet_wrap(~ target_child_age)

# 3-D scatter: embedding on x/y, utterance order on z, colored by speaker.
fig <- plot_ly(
  data = vizconvos,
  x = ~V1,
  y = ~V2,
  z = ~utterance_order,
  color = ~speaker_code,
  type = "scatter3d",
  mode = "markers",
  marker = list(size = 3)
)
fig
# Animated 2-D scatter of the embedding: one frame per `exchange`, colored by
# speaker.
fig <- vizconvos %>%
  plot_ly(
    x = ~V1,
    y = ~V2,
    color = ~speaker_code,
    frame = ~exchange,
    # hoverinfo = "text" displays only the `text` attribute; without a
    # mapping the hover labels are empty, so show the utterance itself.
    text = ~gloss,
    hoverinfo = "text",
    type = 'scatter',
    mode = 'markers'
  )
fig